{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.model_selection import train_test_split,GridSearchCV\n",
    "from sklearn.svm import SVC\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. This problem involves the OJ data set which is part of the ISLR package.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1070, 18)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Purchase</th>\n",
       "      <th>WeekofPurchase</th>\n",
       "      <th>StoreID</th>\n",
       "      <th>PriceCH</th>\n",
       "      <th>PriceMM</th>\n",
       "      <th>DiscCH</th>\n",
       "      <th>DiscMM</th>\n",
       "      <th>SpecialCH</th>\n",
       "      <th>SpecialMM</th>\n",
       "      <th>LoyalCH</th>\n",
       "      <th>SalePriceMM</th>\n",
       "      <th>SalePriceCH</th>\n",
       "      <th>PriceDiff</th>\n",
       "      <th>Store7</th>\n",
       "      <th>PctDiscMM</th>\n",
       "      <th>PctDiscCH</th>\n",
       "      <th>ListPriceDiff</th>\n",
       "      <th>STORE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CH</td>\n",
       "      <td>237</td>\n",
       "      <td>1</td>\n",
       "      <td>1.75</td>\n",
       "      <td>1.99</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>1.99</td>\n",
       "      <td>1.75</td>\n",
       "      <td>0.24</td>\n",
       "      <td>No</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.24</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CH</td>\n",
       "      <td>239</td>\n",
       "      <td>1</td>\n",
       "      <td>1.75</td>\n",
       "      <td>1.99</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.600000</td>\n",
       "      <td>1.69</td>\n",
       "      <td>1.75</td>\n",
       "      <td>-0.06</td>\n",
       "      <td>No</td>\n",
       "      <td>0.150754</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.24</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CH</td>\n",
       "      <td>245</td>\n",
       "      <td>1</td>\n",
       "      <td>1.86</td>\n",
       "      <td>2.09</td>\n",
       "      <td>0.17</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.680000</td>\n",
       "      <td>2.09</td>\n",
       "      <td>1.69</td>\n",
       "      <td>0.40</td>\n",
       "      <td>No</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.091398</td>\n",
       "      <td>0.23</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>MM</td>\n",
       "      <td>227</td>\n",
       "      <td>1</td>\n",
       "      <td>1.69</td>\n",
       "      <td>1.69</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>1.69</td>\n",
       "      <td>1.69</td>\n",
       "      <td>0.00</td>\n",
       "      <td>No</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>CH</td>\n",
       "      <td>228</td>\n",
       "      <td>7</td>\n",
       "      <td>1.69</td>\n",
       "      <td>1.69</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.956535</td>\n",
       "      <td>1.69</td>\n",
       "      <td>1.69</td>\n",
       "      <td>0.00</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Purchase  WeekofPurchase  StoreID  PriceCH  PriceMM  DiscCH  DiscMM  \\\n",
       "1       CH             237        1     1.75     1.99    0.00     0.0   \n",
       "2       CH             239        1     1.75     1.99    0.00     0.3   \n",
       "3       CH             245        1     1.86     2.09    0.17     0.0   \n",
       "4       MM             227        1     1.69     1.69    0.00     0.0   \n",
       "5       CH             228        7     1.69     1.69    0.00     0.0   \n",
       "\n",
       "   SpecialCH  SpecialMM   LoyalCH  SalePriceMM  SalePriceCH  PriceDiff Store7  \\\n",
       "1          0          0  0.500000         1.99         1.75       0.24     No   \n",
       "2          0          1  0.600000         1.69         1.75      -0.06     No   \n",
       "3          0          0  0.680000         2.09         1.69       0.40     No   \n",
       "4          0          0  0.400000         1.69         1.69       0.00     No   \n",
       "5          0          0  0.956535         1.69         1.69       0.00    Yes   \n",
       "\n",
       "   PctDiscMM  PctDiscCH  ListPriceDiff  STORE  \n",
       "1   0.000000   0.000000           0.24      1  \n",
       "2   0.150754   0.000000           0.24      1  \n",
       "3   0.000000   0.091398           0.23      1  \n",
       "4   0.000000   0.000000           0.00      1  \n",
       "5   0.000000   0.000000           0.00      0  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_csv(r'E:\\programming\\dataset\\Into_to_statstical_learning\\OJ.csv',index_col = 0)\n",
    "print(data.shape)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#encode the categorical variables\n",
    "data['Purchase'] = data['Purchase'].map({'CH':1,'MM':0})\n",
    "data['Store7'] = data['Store7'].map({'Yes':1,'No':0})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (a) Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(800, 17) (270, 17)\n"
     ]
    }
   ],
   "source": [
    "X_train,X_test,y_train,y_test = train_test_split(data.drop('Purchase',axis=1),data['Purchase'],test_size = 0.2523,random_state = 1)\n",
    "print(X_train.shape,X_test.shape)\n",
    "#we want 800 observations in train, so manage the test_size according to that (270/1070)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (b) Fit a support vector classifier to the training data using cost=0.01, with Purchase as the response and the other variables as predictors. Use the summary() function to produce summary statistics, and describe the results obtained\n",
    "\n",
    " \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,\n",
       "    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',\n",
       "    kernel='linear', max_iter=-1, probability=False, random_state=None,\n",
       "    shrinking=True, tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = SVC(kernel = 'linear',C = 0.01)\n",
    "model.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of support vectors for each class are  [304 308]\n"
     ]
    }
   ],
   "source": [
    "print('Number of support vectors for each class are ',model.n_support_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (c) What are the training and test error rates?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Accuracy is  0.69\n",
      "Test Accuracy is  0.6407407407407407\n"
     ]
    }
   ],
   "source": [
    "print('Training Accuracy is ',model.score(X_train,y_train))\n",
    "print('Test Accuracy is ',model.score(X_test,y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (d) Use the tune() function to select an optimal cost. Consider values in the range 0.01 to 10."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Lenovo\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n",
      "  warnings.warn(CV_WARNING, FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "search = GridSearchCV(SVC(kernel = 'linear'),param_grid={'C':np.logspace(-2,2,5)})\n",
    "search.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "search.best_params"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (e) Compute the training and test error rates using this new value for cost.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "best_model = search.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Training Accuracy ',best_model.score(X_train,y_train))\n",
    "print('Test Accuracy 'best_model.score(X_test,y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (f) Repeat parts (b) through (e) using a support vector machine with a radial kernel. Use the default value for gamma.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SVC(kernel = 'rbf',C = 0.01)\n",
    "model.fit(X_train,y_train)\n",
    "\n",
    "print('Number of support vectors for each class are ',model.n_support_)\n",
    "\n",
    "\n",
    "print('Training Accuracy for starting model ',model.score(X_train,y_train))\n",
    "print('Test Accuracy for starting model ',model.score(X_test,y_test))\n",
    "\n",
    "\n",
    "search = GridSearchCV(SVC(kernel = 'rbf'),param_grid={'C':np.logspace(-2,2,5)})\n",
    "search.fit(X_train,y_train)\n",
    "\n",
    "print('Best value of C is ',search.best_params)\n",
    "\n",
    "best_model = search.best_estimator_\n",
    "\n",
    "print('Training Accuracy radial',best_model.score(X_train,y_train))\n",
    "print('Test Accuracy radial'best_model.score(X_test,y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (g) Repeat parts (b) through (e) using a support vector machine with a polynomial kernel. Set degree=2.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SVC(kernel = 'poly',degree = 2,C = 0.01)\n",
    "model.fit(X_train,y_train)\n",
    "\n",
    "print('Number of support vectors for each class are ',model.n_support_)\n",
    "\n",
    "\n",
    "print('Training Accuracy for starting model ',model.score(X_train,y_train))\n",
    "print('Test Accuracy for starting model ',model.score(X_test,y_test))\n",
    "\n",
    "\n",
    "search = GridSearchCV(SVC(kernel = 'poly',degree = 2),param_grid={'C':np.logspace(-2,2,5)})\n",
    "search.fit(X_train,y_train)\n",
    "\n",
    "print('Best value of C is ',search.best_params)\n",
    "\n",
    "best_model = search.best_estimator_\n",
    "\n",
    "print('Training Accuracy radial',best_model.score(X_train,y_train))\n",
    "print('Test Accuracy radial'best_model.score(X_test,y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### (h) Overall, which approach seems to give the best results on this data?\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Please look at the above results, and select which will perform the best. My CPU just gave up..."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
